{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "4cad9b8d",
   "metadata": {},
   "source": [
    "#### 导入库"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 996,
   "id": "29ba4e5f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1c796798",
   "metadata": {},
   "source": [
    "#### 查看raw_data目录下所有文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 997,
   "id": "ddde352a",
   "metadata": {},
   "outputs": [],
   "source": [
    "#查看raw_data目录下所有文件\n",
    "input_dir = './raw_data/'\n",
    "files = os.listdir(input_dir)\n",
    "# print(files)\n",
    "#创建列表保存读取的xlsx文件\n",
    "data_list = []\n",
    "#读取每个xlsx文件\n",
    "for file in files:\n",
    "    data_list.append(pd.read_excel(input_dir + file))\n",
    "# print(data_list)\n",
    "# print(len(data_list))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 998,
   "id": "dd856375",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "52256\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>描述</th>\n",
       "      <th>位置信息</th>\n",
       "      <th>区域</th>\n",
       "      <th>房屋信息</th>\n",
       "      <th>关注信息</th>\n",
       "      <th>总价</th>\n",
       "      <th>单价</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>威兰德装修套三对中庭，客户只给契税</td>\n",
       "      <td>威兰德小镇</td>\n",
       "      <td>双流</td>\n",
       "      <td>3室2厅 | 86.4平米 | 东 | 简装 | 中楼层(共26层) | 2016年建 | 塔楼</td>\n",
       "      <td>135人关注 / 6个月以前发布</td>\n",
       "      <td>91.8万</td>\n",
       "      <td>单价10625元/平米</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>房子清水套三户型方正采光好无遮挡，视野开阔！</td>\n",
       "      <td>南湖逸家二期</td>\n",
       "      <td>双流</td>\n",
       "      <td>3室1厅 | 66.12平米 | 东 | 毛坯 | 高楼层(共33层) | 2017年建 |...</td>\n",
       "      <td>40人关注 / 2个月以前发布</td>\n",
       "      <td>128.5万</td>\n",
       "      <td>单价19435元/平米</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>南湖逸家满二精装房，中间楼层，采光良好</td>\n",
       "      <td>南湖逸家二期</td>\n",
       "      <td>双流</td>\n",
       "      <td>3室1厅 | 73.59平米 | 南 | 精装 | 中楼层(共34层) | 2017年建 |...</td>\n",
       "      <td>58人关注 / 15天以前发布</td>\n",
       "      <td>153万</td>\n",
       "      <td>单价20791元/平米</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>佰客郡精装修房子配套成熟业主真心卖</td>\n",
       "      <td>佰客郡</td>\n",
       "      <td>双流</td>\n",
       "      <td>2室1厅 | 75.4平米 | 东 北 | 简装 | 中楼层(共16层) | 2011年建 ...</td>\n",
       "      <td>36人关注 / 2个月以前发布</td>\n",
       "      <td>89万</td>\n",
       "      <td>单价11804元/平米</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>加贝书香尚品 精装修 带家具家电出售</td>\n",
       "      <td>加贝书香尚品</td>\n",
       "      <td>双流</td>\n",
       "      <td>1室1厅 | 51.69平米 | 南 | 精装 | 高楼层(共15层) | 2007年建 | 板楼</td>\n",
       "      <td>38人关注 / 1个月以前发布</td>\n",
       "      <td>64.5万</td>\n",
       "      <td>单价12479元/平米</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       描述     位置信息  区域  \\\n",
       "0       威兰德装修套三对中庭，客户只给契税   威兰德小镇   双流   \n",
       "1  房子清水套三户型方正采光好无遮挡，视野开阔！  南湖逸家二期   双流   \n",
       "2     南湖逸家满二精装房，中间楼层，采光良好  南湖逸家二期   双流   \n",
       "3       佰客郡精装修房子配套成熟业主真心卖     佰客郡   双流   \n",
       "4      加贝书香尚品 精装修 带家具家电出售  加贝书香尚品   双流   \n",
       "\n",
       "                                                房屋信息              关注信息  \\\n",
       "0   3室2厅 | 86.4平米 | 东 | 简装 | 中楼层(共26层) | 2016年建 | 塔楼  135人关注 / 6个月以前发布   \n",
       "1  3室1厅 | 66.12平米 | 东 | 毛坯 | 高楼层(共33层) | 2017年建 |...   40人关注 / 2个月以前发布   \n",
       "2  3室1厅 | 73.59平米 | 南 | 精装 | 中楼层(共34层) | 2017年建 |...   58人关注 / 15天以前发布   \n",
       "3  2室1厅 | 75.4平米 | 东 北 | 简装 | 中楼层(共16层) | 2011年建 ...   36人关注 / 2个月以前发布   \n",
       "4  1室1厅 | 51.69平米 | 南 | 精装 | 高楼层(共15层) | 2007年建 | 板楼   38人关注 / 1个月以前发布   \n",
       "\n",
       "       总价           单价  \n",
       "0   91.8万  单价10625元/平米  \n",
       "1  128.5万  单价19435元/平米  \n",
       "2    153万  单价20791元/平米  \n",
       "3     89万  单价11804元/平米  \n",
       "4   64.5万  单价12479元/平米  "
      ]
     },
     "execution_count": 998,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#合并所有文件\n",
    "data = pd.concat(data_list)\n",
    "# print(data)\n",
    "# 重建列索引\n",
    "data = data.reset_index()\n",
    "# data.tail(5)\n",
    "# print('*******************')\n",
    "# print(data)\n",
    "data = data.drop('index',axis=1)\n",
    "# print(data)\n",
    "#查看数据记录个数\n",
    "print(len(data))\n",
    "#预览读入文件中的前5条记录\n",
    "data.head(5)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 999,
   "id": "0e503e42",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0        7\n",
      "1        7\n",
      "2        7\n",
      "3        7\n",
      "4        7\n",
      "        ..\n",
      "52251    7\n",
      "52252    6\n",
      "52253    6\n",
      "52254    7\n",
      "52255    6\n",
      "Name: 房屋信息, Length: 52256, dtype: int64\n",
      "房屋信息\n",
      "7    39045\n",
      "6    12669\n",
      "8      370\n",
      "4      122\n",
      "5       50\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "#检查每条房屋信息记录中保存的信息个数\n",
    "nrec = data.房屋信息.map(lambda x:len(x.split('|')))\n",
    "print(nrec)\n",
    "nrec.value_counts()\n",
    "# 查看每条记录中保存的信息个数的分布\n",
    "print(nrec.value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1000,
   "id": "5a7198ca",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                           描述     位置信息   区域  \\\n",
      "0           威兰德装修套三对中庭，客户只给契税   威兰德小镇    双流   \n",
      "1      房子清水套三户型方正采光好无遮挡，视野开阔！  南湖逸家二期    双流   \n",
      "2         南湖逸家满二精装房，中间楼层，采光良好  南湖逸家二期    双流   \n",
      "3           佰客郡精装修房子配套成熟业主真心卖     佰客郡    双流   \n",
      "4          加贝书香尚品 精装修 带家具家电出售  加贝书香尚品    双流   \n",
      "...                       ...      ...  ...   \n",
      "52248        居家装修大套三，双卫，可拎包入住    春秋名邸   龙泉驿   \n",
      "52249         标准套二，中间楼层，户型方正。    春秋家院   龙泉驿   \n",
      "52250  新上房源  精致装修  户型好  业主诚心卖    四季映像   龙泉驿   \n",
      "52251         装修套三，业主置换。看小区中庭    林溪康城   龙泉驿   \n",
      "52254       林溪康城+清水+套三双卫+户型方正    林溪康城   龙泉驿   \n",
      "\n",
      "                                                    房屋信息              关注信息  \\\n",
      "0       3室2厅 | 86.4平米 | 东 | 简装 | 中楼层(共26层) | 2016年建 | 塔楼  135人关注 / 6个月以前发布   \n",
      "1      3室1厅 | 66.12平米 | 东 | 毛坯 | 高楼层(共33层) | 2017年建 |...   40人关注 / 2个月以前发布   \n",
      "2      3室1厅 | 73.59平米 | 南 | 精装 | 中楼层(共34层) | 2017年建 |...   58人关注 / 15天以前发布   \n",
      "3      2室1厅 | 75.4平米 | 东 北 | 简装 | 中楼层(共16层) | 2011年建 ...   36人关注 / 2个月以前发布   \n",
      "4      1室1厅 | 51.69平米 | 南 | 精装 | 高楼层(共15层) | 2007年建 | 板楼   38人关注 / 1个月以前发布   \n",
      "...                                                  ...               ...   \n",
      "52248  3室2厅 | 116.72平米 | 东南 | 其他 | 中楼层(共22层) | 2012年建...    4人关注 / 2个月以前发布   \n",
      "52249     2室2厅 | 80平米 | 东南 | 简装 | 中楼层(共7层) | 2003年建 | 板楼   18人关注 / 2个月以前发布   \n",
      "52250  3室2厅 | 85.72平米 | 东南 | 精装 | 低楼层(共18层) | 2012年建 ...    8人关注 / 2个月以前发布   \n",
      "52251  3室2厅 | 90.9平米 | 东 | 精装 | 低楼层(共34层) | 2017年建 | ...   13人关注 / 2个月以前发布   \n",
      "52254  3室2厅 | 89.35平米 | 东南 | 毛坯 | 低楼层(共34层) | 2017年建 ...    2人关注 / 2个月以前发布   \n",
      "\n",
      "           总价           单价  \n",
      "0       91.8万  单价10625元/平米  \n",
      "1      128.5万  单价19435元/平米  \n",
      "2        153万  单价20791元/平米  \n",
      "3         89万  单价11804元/平米  \n",
      "4       64.5万  单价12479元/平米  \n",
      "...       ...          ...  \n",
      "52248    130万  单价11138元/平米  \n",
      "52249     83万  单价10375元/平米  \n",
      "52250    138万  单价16099元/平米  \n",
      "52251     98万  单价10782元/平米  \n",
      "52254     93万  单价10409元/平米  \n",
      "\n",
      "[39045 rows x 7 columns]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "39045"
      ]
     },
     "execution_count": 1000,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 只保留有7条信息的记录\n",
    "data = data[nrec == 7]\n",
    "print(data)\n",
    "# 打印现有记录长度\n",
    "len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1001,
   "id": "79444962",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>描述</th>\n",
       "      <th>位置信息</th>\n",
       "      <th>区域</th>\n",
       "      <th>关注信息</th>\n",
       "      <th>总价</th>\n",
       "      <th>单价</th>\n",
       "      <th>户型</th>\n",
       "      <th>面积</th>\n",
       "      <th>朝向</th>\n",
       "      <th>类型</th>\n",
       "      <th>楼层</th>\n",
       "      <th>建成时间</th>\n",
       "      <th>结构</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>威兰德装修套三对中庭，客户只给契税</td>\n",
       "      <td>威兰德小镇</td>\n",
       "      <td>双流</td>\n",
       "      <td>135人关注 / 6个月以前发布</td>\n",
       "      <td>91.8万</td>\n",
       "      <td>单价10625元/平米</td>\n",
       "      <td>3室2厅</td>\n",
       "      <td>86.4平米</td>\n",
       "      <td>东</td>\n",
       "      <td>简装</td>\n",
       "      <td>中楼层(共26层)</td>\n",
       "      <td>2016年建</td>\n",
       "      <td>塔楼</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>房子清水套三户型方正采光好无遮挡，视野开阔！</td>\n",
       "      <td>南湖逸家二期</td>\n",
       "      <td>双流</td>\n",
       "      <td>40人关注 / 2个月以前发布</td>\n",
       "      <td>128.5万</td>\n",
       "      <td>单价19435元/平米</td>\n",
       "      <td>3室1厅</td>\n",
       "      <td>66.12平米</td>\n",
       "      <td>东</td>\n",
       "      <td>毛坯</td>\n",
       "      <td>高楼层(共33层)</td>\n",
       "      <td>2017年建</td>\n",
       "      <td>板塔结合</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>南湖逸家满二精装房，中间楼层，采光良好</td>\n",
       "      <td>南湖逸家二期</td>\n",
       "      <td>双流</td>\n",
       "      <td>58人关注 / 15天以前发布</td>\n",
       "      <td>153万</td>\n",
       "      <td>单价20791元/平米</td>\n",
       "      <td>3室1厅</td>\n",
       "      <td>73.59平米</td>\n",
       "      <td>南</td>\n",
       "      <td>精装</td>\n",
       "      <td>中楼层(共34层)</td>\n",
       "      <td>2017年建</td>\n",
       "      <td>板塔结合</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>佰客郡精装修房子配套成熟业主真心卖</td>\n",
       "      <td>佰客郡</td>\n",
       "      <td>双流</td>\n",
       "      <td>36人关注 / 2个月以前发布</td>\n",
       "      <td>89万</td>\n",
       "      <td>单价11804元/平米</td>\n",
       "      <td>2室1厅</td>\n",
       "      <td>75.4平米</td>\n",
       "      <td>东 北</td>\n",
       "      <td>简装</td>\n",
       "      <td>中楼层(共16层)</td>\n",
       "      <td>2011年建</td>\n",
       "      <td>板楼</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>加贝书香尚品 精装修 带家具家电出售</td>\n",
       "      <td>加贝书香尚品</td>\n",
       "      <td>双流</td>\n",
       "      <td>38人关注 / 1个月以前发布</td>\n",
       "      <td>64.5万</td>\n",
       "      <td>单价12479元/平米</td>\n",
       "      <td>1室1厅</td>\n",
       "      <td>51.69平米</td>\n",
       "      <td>南</td>\n",
       "      <td>精装</td>\n",
       "      <td>高楼层(共15层)</td>\n",
       "      <td>2007年建</td>\n",
       "      <td>板楼</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       描述     位置信息  区域              关注信息      总价           单价  \\\n",
       "0       威兰德装修套三对中庭，客户只给契税   威兰德小镇   双流  135人关注 / 6个月以前发布   91.8万  单价10625元/平米   \n",
       "1  房子清水套三户型方正采光好无遮挡，视野开阔！  南湖逸家二期   双流   40人关注 / 2个月以前发布  128.5万  单价19435元/平米   \n",
       "2     南湖逸家满二精装房，中间楼层，采光良好  南湖逸家二期   双流   58人关注 / 15天以前发布    153万  单价20791元/平米   \n",
       "3       佰客郡精装修房子配套成熟业主真心卖     佰客郡   双流   36人关注 / 2个月以前发布     89万  单价11804元/平米   \n",
       "4      加贝书香尚品 精装修 带家具家电出售  加贝书香尚品   双流   38人关注 / 1个月以前发布   64.5万  单价12479元/平米   \n",
       "\n",
       "      户型         面积     朝向    类型           楼层      建成时间     结构  \n",
       "0  3室2厅     86.4平米      东    简装    中楼层(共26层)    2016年建      塔楼  \n",
       "1  3室1厅    66.12平米      东    毛坯    高楼层(共33层)    2017年建    板塔结合  \n",
       "2  3室1厅    73.59平米      南    精装    中楼层(共34层)    2017年建    板塔结合  \n",
       "3  2室1厅     75.4平米    东 北    简装    中楼层(共16层)    2011年建      板楼  \n",
       "4  1室1厅    51.69平米      南    精装    高楼层(共15层)    2007年建      板楼  "
      ]
     },
     "execution_count": 1001,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 拆分房屋信息所在列，并将拆分结果增至新标签列\n",
    "data['户型'] = data.房屋信息.map(lambda x:x.split('|')[0])\n",
    "data['面积'] = data.房屋信息.map(lambda x:x.split('|')[1])\n",
    "data['朝向'] = data.房屋信息.map(lambda x:x.split('|')[2])\n",
    "data['类型'] = data.房屋信息.map(lambda x:x.split('|')[3])\n",
    "data['楼层'] = data.房屋信息.map(lambda x:x.split('|')[4])\n",
    "data['建成时间'] = data.房屋信息.map(lambda x:x.split('|')[5])\n",
    "data['结构'] = data.房屋信息.map(lambda x:x.split('|')[6])\n",
    "# 删除房屋信息列\n",
    "data = data.drop('房屋信息', axis=1)\n",
    "data.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1002,
   "id": "53933603",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0        2\n",
      "1        2\n",
      "2        2\n",
      "3        2\n",
      "4        2\n",
      "        ..\n",
      "52248    2\n",
      "52249    2\n",
      "52250    2\n",
      "52251    2\n",
      "52254    2\n",
      "Name: 关注信息, Length: 39045, dtype: int64\n",
      "关注信息\n",
      "2    39045\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "#检索每条关注信息记录中保存的信息个数\n",
    "nrec = data.关注信息.map(lambda x:len(x.split('/')))\n",
    "print(nrec)\n",
    "# 查看每条记录中保存的信息个数的分布\n",
    "nrec.value_counts()\n",
    "print(nrec.value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1003,
   "id": "a2242c94",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>描述</th>\n",
       "      <th>位置信息</th>\n",
       "      <th>区域</th>\n",
       "      <th>总价</th>\n",
       "      <th>单价</th>\n",
       "      <th>户型</th>\n",
       "      <th>面积</th>\n",
       "      <th>朝向</th>\n",
       "      <th>类型</th>\n",
       "      <th>楼层</th>\n",
       "      <th>建成时间</th>\n",
       "      <th>结构</th>\n",
       "      <th>关注人数</th>\n",
       "      <th>发布时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>威兰德装修套三对中庭，客户只给契税</td>\n",
       "      <td>威兰德小镇</td>\n",
       "      <td>双流</td>\n",
       "      <td>91.8万</td>\n",
       "      <td>单价10625元/平米</td>\n",
       "      <td>3室2厅</td>\n",
       "      <td>86.4平米</td>\n",
       "      <td>东</td>\n",
       "      <td>简装</td>\n",
       "      <td>中楼层(共26层)</td>\n",
       "      <td>2016年建</td>\n",
       "      <td>塔楼</td>\n",
       "      <td>135人关注</td>\n",
       "      <td>6个月以前发布</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>房子清水套三户型方正采光好无遮挡，视野开阔！</td>\n",
       "      <td>南湖逸家二期</td>\n",
       "      <td>双流</td>\n",
       "      <td>128.5万</td>\n",
       "      <td>单价19435元/平米</td>\n",
       "      <td>3室1厅</td>\n",
       "      <td>66.12平米</td>\n",
       "      <td>东</td>\n",
       "      <td>毛坯</td>\n",
       "      <td>高楼层(共33层)</td>\n",
       "      <td>2017年建</td>\n",
       "      <td>板塔结合</td>\n",
       "      <td>40人关注</td>\n",
       "      <td>2个月以前发布</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>南湖逸家满二精装房，中间楼层，采光良好</td>\n",
       "      <td>南湖逸家二期</td>\n",
       "      <td>双流</td>\n",
       "      <td>153万</td>\n",
       "      <td>单价20791元/平米</td>\n",
       "      <td>3室1厅</td>\n",
       "      <td>73.59平米</td>\n",
       "      <td>南</td>\n",
       "      <td>精装</td>\n",
       "      <td>中楼层(共34层)</td>\n",
       "      <td>2017年建</td>\n",
       "      <td>板塔结合</td>\n",
       "      <td>58人关注</td>\n",
       "      <td>15天以前发布</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>佰客郡精装修房子配套成熟业主真心卖</td>\n",
       "      <td>佰客郡</td>\n",
       "      <td>双流</td>\n",
       "      <td>89万</td>\n",
       "      <td>单价11804元/平米</td>\n",
       "      <td>2室1厅</td>\n",
       "      <td>75.4平米</td>\n",
       "      <td>东 北</td>\n",
       "      <td>简装</td>\n",
       "      <td>中楼层(共16层)</td>\n",
       "      <td>2011年建</td>\n",
       "      <td>板楼</td>\n",
       "      <td>36人关注</td>\n",
       "      <td>2个月以前发布</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>加贝书香尚品 精装修 带家具家电出售</td>\n",
       "      <td>加贝书香尚品</td>\n",
       "      <td>双流</td>\n",
       "      <td>64.5万</td>\n",
       "      <td>单价12479元/平米</td>\n",
       "      <td>1室1厅</td>\n",
       "      <td>51.69平米</td>\n",
       "      <td>南</td>\n",
       "      <td>精装</td>\n",
       "      <td>高楼层(共15层)</td>\n",
       "      <td>2007年建</td>\n",
       "      <td>板楼</td>\n",
       "      <td>38人关注</td>\n",
       "      <td>1个月以前发布</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       描述     位置信息  区域      总价           单价     户型         面积  \\\n",
       "0       威兰德装修套三对中庭，客户只给契税   威兰德小镇   双流   91.8万  单价10625元/平米  3室2厅     86.4平米    \n",
       "1  房子清水套三户型方正采光好无遮挡，视野开阔！  南湖逸家二期   双流  128.5万  单价19435元/平米  3室1厅    66.12平米    \n",
       "2     南湖逸家满二精装房，中间楼层，采光良好  南湖逸家二期   双流    153万  单价20791元/平米  3室1厅    73.59平米    \n",
       "3       佰客郡精装修房子配套成熟业主真心卖     佰客郡   双流     89万  单价11804元/平米  2室1厅     75.4平米    \n",
       "4      加贝书香尚品 精装修 带家具家电出售  加贝书香尚品   双流   64.5万  单价12479元/平米  1室1厅    51.69平米    \n",
       "\n",
       "      朝向    类型           楼层      建成时间     结构     关注人数      发布时间  \n",
       "0     东    简装    中楼层(共26层)    2016年建      塔楼  135人关注    6个月以前发布  \n",
       "1     东    毛坯    高楼层(共33层)    2017年建    板塔结合   40人关注    2个月以前发布  \n",
       "2     南    精装    中楼层(共34层)    2017年建    板塔结合   58人关注    15天以前发布  \n",
       "3   东 北    简装    中楼层(共16层)    2011年建      板楼   36人关注    2个月以前发布  \n",
       "4     南    精装    高楼层(共15层)    2007年建      板楼   38人关注    1个月以前发布  "
      ]
     },
     "execution_count": 1003,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#拆分关注信息所在列，并将拆分结果增值新标签列\n",
    "data['关注人数'] = data.关注信息.map(lambda x:x.split('/')[0])\n",
    "data['发布时间'] = data.关注信息.map(lambda x:x.split('/')[1])\n",
    "# 删除关注信息列\n",
    "data = data.drop('关注信息', axis=1)\n",
    "# 查看数据\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1004,
   "id": "3757100a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>描述</th>\n",
       "      <th>位置信息</th>\n",
       "      <th>区域</th>\n",
       "      <th>总价</th>\n",
       "      <th>单价</th>\n",
       "      <th>户型</th>\n",
       "      <th>面积</th>\n",
       "      <th>朝向</th>\n",
       "      <th>类型</th>\n",
       "      <th>楼层</th>\n",
       "      <th>建成时间</th>\n",
       "      <th>结构</th>\n",
       "      <th>关注人数</th>\n",
       "      <th>发布时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52248</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52249</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52250</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52251</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52254</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>39045 rows × 14 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          描述   位置信息     区域     总价     单价     户型     面积     朝向     类型     楼层  \\\n",
       "0      False  False  False  False  False  False  False  False  False  False   \n",
       "1      False  False  False  False  False  False  False  False  False  False   \n",
       "2      False  False  False  False  False  False  False  False  False  False   \n",
       "3      False  False  False  False  False  False  False  False  False  False   \n",
       "4      False  False  False  False  False  False  False  False  False  False   \n",
       "...      ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   \n",
       "52248  False  False  False  False  False  False  False  False  False  False   \n",
       "52249  False  False  False  False  False  False  False  False  False  False   \n",
       "52250  False  False  False  False  False  False  False  False  False  False   \n",
       "52251  False  False  False  False  False  False  False  False  False  False   \n",
       "52254  False  False  False  False  False  False  False  False  False  False   \n",
       "\n",
       "        建成时间     结构   关注人数   发布时间  \n",
       "0      False  False  False  False  \n",
       "1      False  False  False  False  \n",
       "2      False  False  False  False  \n",
       "3      False  False  False  False  \n",
       "4      False  False  False  False  \n",
       "...      ...    ...    ...    ...  \n",
       "52248  False  False  False  False  \n",
       "52249  False  False  False  False  \n",
       "52250  False  False  False  False  \n",
       "52251  False  False  False  False  \n",
       "52254  False  False  False  False  \n",
       "\n",
       "[39045 rows x 14 columns]"
      ]
     },
     "execution_count": 1004,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#缺失值检查\n",
    "data.isnull()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1005,
   "id": "c2949469",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "描述      0\n",
       "位置信息    0\n",
       "区域      0\n",
       "总价      0\n",
       "单价      0\n",
       "户型      0\n",
       "面积      0\n",
       "朝向      0\n",
       "类型      0\n",
       "楼层      0\n",
       "建成时间    0\n",
       "结构      0\n",
       "关注人数    0\n",
       "发布时间    0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 1005,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#统计缺失值个数\n",
    "(data.isnull()).sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1006,
   "id": "a8637279",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "np.int64(16)"
      ]
     },
     "execution_count": 1006,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#检查重复值\n",
    "(data.duplicated()).sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1007,
   "id": "96338509",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['单价元平米'], dtype=object)"
      ]
     },
     "execution_count": 1007,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 使用正则表达式查看单价列中含有的中文字符种类\n",
    "data.单价.map(lambda x: re.sub('[^\\u4E00-\\u9FA5]','',x)).unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1008,
   "id": "78e3bee8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>描述</th>\n",
       "      <th>位置信息</th>\n",
       "      <th>区域</th>\n",
       "      <th>总价</th>\n",
       "      <th>单价</th>\n",
       "      <th>户型</th>\n",
       "      <th>面积</th>\n",
       "      <th>朝向</th>\n",
       "      <th>类型</th>\n",
       "      <th>楼层</th>\n",
       "      <th>建成时间</th>\n",
       "      <th>结构</th>\n",
       "      <th>关注人数</th>\n",
       "      <th>发布时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>威兰德装修套三对中庭，客户只给契税</td>\n",
       "      <td>威兰德小镇</td>\n",
       "      <td>双流</td>\n",
       "      <td>91.80</td>\n",
       "      <td>单价10625元/平米</td>\n",
       "      <td>3室2厅</td>\n",
       "      <td>86.4平米</td>\n",
       "      <td>东</td>\n",
       "      <td>简装</td>\n",
       "      <td>中楼层(共26层)</td>\n",
       "      <td>2016年建</td>\n",
       "      <td>塔楼</td>\n",
       "      <td>135人关注</td>\n",
       "      <td>6个月以前发布</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>房子清水套三户型方正采光好无遮挡，视野开阔！</td>\n",
       "      <td>南湖逸家二期</td>\n",
       "      <td>双流</td>\n",
       "      <td>128.50</td>\n",
       "      <td>单价19435元/平米</td>\n",
       "      <td>3室1厅</td>\n",
       "      <td>66.12平米</td>\n",
       "      <td>东</td>\n",
       "      <td>毛坯</td>\n",
       "      <td>高楼层(共33层)</td>\n",
       "      <td>2017年建</td>\n",
       "      <td>板塔结合</td>\n",
       "      <td>40人关注</td>\n",
       "      <td>2个月以前发布</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>南湖逸家满二精装房，中间楼层，采光良好</td>\n",
       "      <td>南湖逸家二期</td>\n",
       "      <td>双流</td>\n",
       "      <td>153.00</td>\n",
       "      <td>单价20791元/平米</td>\n",
       "      <td>3室1厅</td>\n",
       "      <td>73.59平米</td>\n",
       "      <td>南</td>\n",
       "      <td>精装</td>\n",
       "      <td>中楼层(共34层)</td>\n",
       "      <td>2017年建</td>\n",
       "      <td>板塔结合</td>\n",
       "      <td>58人关注</td>\n",
       "      <td>15天以前发布</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>佰客郡精装修房子配套成熟业主真心卖</td>\n",
       "      <td>佰客郡</td>\n",
       "      <td>双流</td>\n",
       "      <td>89.00</td>\n",
       "      <td>单价11804元/平米</td>\n",
       "      <td>2室1厅</td>\n",
       "      <td>75.4平米</td>\n",
       "      <td>东 北</td>\n",
       "      <td>简装</td>\n",
       "      <td>中楼层(共16层)</td>\n",
       "      <td>2011年建</td>\n",
       "      <td>板楼</td>\n",
       "      <td>36人关注</td>\n",
       "      <td>2个月以前发布</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>加贝书香尚品 精装修 带家具家电出售</td>\n",
       "      <td>加贝书香尚品</td>\n",
       "      <td>双流</td>\n",
       "      <td>64.50</td>\n",
       "      <td>单价12479元/平米</td>\n",
       "      <td>1室1厅</td>\n",
       "      <td>51.69平米</td>\n",
       "      <td>南</td>\n",
       "      <td>精装</td>\n",
       "      <td>高楼层(共15层)</td>\n",
       "      <td>2007年建</td>\n",
       "      <td>板楼</td>\n",
       "      <td>38人关注</td>\n",
       "      <td>1个月以前发布</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       描述     位置信息  区域      总价           单价     户型         面积  \\\n",
       "0       威兰德装修套三对中庭，客户只给契税   威兰德小镇   双流   91.80  单价10625元/平米  3室2厅     86.4平米    \n",
       "1  房子清水套三户型方正采光好无遮挡，视野开阔！  南湖逸家二期   双流  128.50  单价19435元/平米  3室1厅    66.12平米    \n",
       "2     南湖逸家满二精装房，中间楼层，采光良好  南湖逸家二期   双流  153.00  单价20791元/平米  3室1厅    73.59平米    \n",
       "3       佰客郡精装修房子配套成熟业主真心卖     佰客郡   双流   89.00  单价11804元/平米  2室1厅     75.4平米    \n",
       "4      加贝书香尚品 精装修 带家具家电出售  加贝书香尚品   双流   64.50  单价12479元/平米  1室1厅    51.69平米    \n",
       "\n",
       "      朝向    类型           楼层      建成时间     结构     关注人数      发布时间  \n",
       "0     东    简装    中楼层(共26层)    2016年建      塔楼  135人关注    6个月以前发布  \n",
       "1     东    毛坯    高楼层(共33层)    2017年建    板塔结合   40人关注    2个月以前发布  \n",
       "2     南    精装    中楼层(共34层)    2017年建    板塔结合   58人关注    15天以前发布  \n",
       "3   东 北    简装    中楼层(共16层)    2011年建      板楼   36人关注    2个月以前发布  \n",
       "4     南    精装    高楼层(共15层)    2007年建      板楼   38人关注    1个月以前发布  "
      ]
     },
     "execution_count": 1008,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 删去字符“万”，并将类型转换为float，并保留两位小数\n",
    "data['总价'] = data.总价.map(lambda x: re.sub('万','',x))\n",
    "data['总价'] = data.总价.map(lambda x: format(float(x), '.2f'))\n",
    "data.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1009,
   "id": "5bba2670",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['单价元平米']\n",
      "['平米']\n",
      "['年建' '板塔结合' '板楼' '暂无数据' '塔楼']\n",
      "['人关注']\n"
     ]
    }
   ],
   "source": [
    "# 依次检查其他列所含的中文字符\n",
    "print(data.单价.map(lambda x : re.sub('[^\\u4E00-\\u9FA5]','',x)).unique())\n",
    "print(data.面积.map(lambda x : re.sub('[^\\u4E00-\\u9FA5]','',x)).unique())\n",
    "print(data.建成时间.map(lambda x : re.sub('[^\\u4E00-\\u9FA5]','',x)).unique())\n",
    "print(data.关注人数.map(lambda x : re.sub('[^\\u4E00-\\u9FA5]','',x)).unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1010,
   "id": "ee1fa672",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "245"
      ]
     },
     "execution_count": 1010,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#建成时间列不包含关键字'年建'的记录数\n",
    "len(data[~data.建成时间.str.contains('年建')])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1011,
   "id": "83f6697d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "38800"
      ]
     },
     "execution_count": 1011,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#只保留含关键字'年建'的记录\n",
    "data = data[data.建成时间.str.contains('年建')]\n",
    "len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1012,
   "id": "2a26fe29",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>描述</th>\n",
       "      <th>位置信息</th>\n",
       "      <th>区域</th>\n",
       "      <th>总价</th>\n",
       "      <th>单价</th>\n",
       "      <th>户型</th>\n",
       "      <th>面积</th>\n",
       "      <th>朝向</th>\n",
       "      <th>类型</th>\n",
       "      <th>楼层</th>\n",
       "      <th>建成时间</th>\n",
       "      <th>结构</th>\n",
       "      <th>关注人数</th>\n",
       "      <th>发布时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>威兰德装修套三对中庭，客户只给契税</td>\n",
       "      <td>威兰德小镇</td>\n",
       "      <td>双流</td>\n",
       "      <td>91.80</td>\n",
       "      <td>1.06</td>\n",
       "      <td>3室2厅</td>\n",
       "      <td>86.40</td>\n",
       "      <td>东</td>\n",
       "      <td>简装</td>\n",
       "      <td>中楼层(共26层)</td>\n",
       "      <td>2016.0</td>\n",
       "      <td>塔楼</td>\n",
       "      <td>135.0</td>\n",
       "      <td>6个月以前发布</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>房子清水套三户型方正采光好无遮挡，视野开阔！</td>\n",
       "      <td>南湖逸家二期</td>\n",
       "      <td>双流</td>\n",
       "      <td>128.50</td>\n",
       "      <td>1.94</td>\n",
       "      <td>3室1厅</td>\n",
       "      <td>66.12</td>\n",
       "      <td>东</td>\n",
       "      <td>毛坯</td>\n",
       "      <td>高楼层(共33层)</td>\n",
       "      <td>2017.0</td>\n",
       "      <td>板塔结合</td>\n",
       "      <td>40.0</td>\n",
       "      <td>2个月以前发布</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>南湖逸家满二精装房，中间楼层，采光良好</td>\n",
       "      <td>南湖逸家二期</td>\n",
       "      <td>双流</td>\n",
       "      <td>153.00</td>\n",
       "      <td>2.08</td>\n",
       "      <td>3室1厅</td>\n",
       "      <td>73.59</td>\n",
       "      <td>南</td>\n",
       "      <td>精装</td>\n",
       "      <td>中楼层(共34层)</td>\n",
       "      <td>2017.0</td>\n",
       "      <td>板塔结合</td>\n",
       "      <td>58.0</td>\n",
       "      <td>15天以前发布</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>佰客郡精装修房子配套成熟业主真心卖</td>\n",
       "      <td>佰客郡</td>\n",
       "      <td>双流</td>\n",
       "      <td>89.00</td>\n",
       "      <td>1.18</td>\n",
       "      <td>2室1厅</td>\n",
       "      <td>75.40</td>\n",
       "      <td>东 北</td>\n",
       "      <td>简装</td>\n",
       "      <td>中楼层(共16层)</td>\n",
       "      <td>2011.0</td>\n",
       "      <td>板楼</td>\n",
       "      <td>36.0</td>\n",
       "      <td>2个月以前发布</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>加贝书香尚品 精装修 带家具家电出售</td>\n",
       "      <td>加贝书香尚品</td>\n",
       "      <td>双流</td>\n",
       "      <td>64.50</td>\n",
       "      <td>1.25</td>\n",
       "      <td>1室1厅</td>\n",
       "      <td>51.69</td>\n",
       "      <td>南</td>\n",
       "      <td>精装</td>\n",
       "      <td>高楼层(共15层)</td>\n",
       "      <td>2007.0</td>\n",
       "      <td>板楼</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1个月以前发布</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       描述     位置信息  区域      总价    单价     户型     面积     朝向  \\\n",
       "0       威兰德装修套三对中庭，客户只给契税   威兰德小镇   双流   91.80  1.06  3室2厅   86.40     东    \n",
       "1  房子清水套三户型方正采光好无遮挡，视野开阔！  南湖逸家二期   双流  128.50  1.94  3室1厅   66.12     东    \n",
       "2     南湖逸家满二精装房，中间楼层，采光良好  南湖逸家二期   双流  153.00  2.08  3室1厅   73.59     南    \n",
       "3       佰客郡精装修房子配套成熟业主真心卖     佰客郡   双流   89.00  1.18  2室1厅   75.40   东 北    \n",
       "4      加贝书香尚品 精装修 带家具家电出售  加贝书香尚品   双流   64.50  1.25  1室1厅   51.69     南    \n",
       "\n",
       "     类型           楼层    建成时间     结构   关注人数      发布时间  \n",
       "0   简装    中楼层(共26层)   2016.0     塔楼  135.0   6个月以前发布  \n",
       "1   毛坯    高楼层(共33层)   2017.0   板塔结合   40.0   2个月以前发布  \n",
       "2   精装    中楼层(共34层)   2017.0   板塔结合   58.0   15天以前发布  \n",
       "3   简装    中楼层(共16层)   2011.0     板楼   36.0   2个月以前发布  \n",
       "4   精装    高楼层(共15层)   2007.0     板楼   38.0   1个月以前发布  "
      ]
     },
     "execution_count": 1012,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 将单价列转换为float类型\n",
    "data['单价'] = data.单价.map(lambda x : round(float(re.findall(r'单价(.*?)元/平米',x)[0])/10000,2))\n",
    "# 将面积、建成时间和关注人数列转换为float类型\n",
    "data['面积'] = data.面积.map(lambda x : round(float(x.replace('平米','')),2))\n",
    "data['建成时间'] = data.建成时间.map(lambda x : float(x.replace('年建','')))\n",
    "data['关注人数'] = data.关注人数.map(lambda x : float(x.replace('人关注','')))\n",
    "\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1013,
   "id": "1da8aaf0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['3室2厅 ', '3室1厅 ', '2室1厅 ', '1室1厅 ', '2室2厅 ', '4室2厅 ', '5室2厅 ',\n",
       "       '4室1厅 ', '1室2厅 ', '3室3厅 ', '1室0厅 ', '5室1厅 ', '6室2厅 ', '4室3厅 ',\n",
       "       '6室1厅 ', '3室0厅 ', '6室4厅 ', '5室3厅 ', '4室4厅 ', '6室3厅 ', '7室2厅 ',\n",
       "       '7室3厅 ', '2室0厅 ', '4室0厅 ', '5室0厅 ', '8室2厅 ', '3室4厅 ', '7室1厅 ',\n",
       "       '7室4厅 ', '9室2厅 ', '8室3厅 ', '5室4厅 ', '0室1厅 ', '7室5厅 ', '0室0厅 '],\n",
       "      dtype=object)"
      ]
     },
     "execution_count": 1013,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#查看户型有多少种类\n",
    "data.户型.unique()\n",
    "# print(len(data.户型.unique()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1014,
   "id": "724e5a6b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>描述</th>\n",
       "      <th>位置信息</th>\n",
       "      <th>区域</th>\n",
       "      <th>总价</th>\n",
       "      <th>单价</th>\n",
       "      <th>面积</th>\n",
       "      <th>朝向</th>\n",
       "      <th>类型</th>\n",
       "      <th>楼层</th>\n",
       "      <th>建成时间</th>\n",
       "      <th>...</th>\n",
       "      <th>6室3厅</th>\n",
       "      <th>6室4厅</th>\n",
       "      <th>7室1厅</th>\n",
       "      <th>7室2厅</th>\n",
       "      <th>7室3厅</th>\n",
       "      <th>7室4厅</th>\n",
       "      <th>7室5厅</th>\n",
       "      <th>8室2厅</th>\n",
       "      <th>8室3厅</th>\n",
       "      <th>9室2厅</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>威兰德装修套三对中庭，客户只给契税</td>\n",
       "      <td>威兰德小镇</td>\n",
       "      <td>双流</td>\n",
       "      <td>91.80</td>\n",
       "      <td>1.06</td>\n",
       "      <td>86.40</td>\n",
       "      <td>东</td>\n",
       "      <td>简装</td>\n",
       "      <td>中楼层(共26层)</td>\n",
       "      <td>2016.0</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>房子清水套三户型方正采光好无遮挡，视野开阔！</td>\n",
       "      <td>南湖逸家二期</td>\n",
       "      <td>双流</td>\n",
       "      <td>128.50</td>\n",
       "      <td>1.94</td>\n",
       "      <td>66.12</td>\n",
       "      <td>东</td>\n",
       "      <td>毛坯</td>\n",
       "      <td>高楼层(共33层)</td>\n",
       "      <td>2017.0</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>南湖逸家满二精装房，中间楼层，采光良好</td>\n",
       "      <td>南湖逸家二期</td>\n",
       "      <td>双流</td>\n",
       "      <td>153.00</td>\n",
       "      <td>2.08</td>\n",
       "      <td>73.59</td>\n",
       "      <td>南</td>\n",
       "      <td>精装</td>\n",
       "      <td>中楼层(共34层)</td>\n",
       "      <td>2017.0</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>佰客郡精装修房子配套成熟业主真心卖</td>\n",
       "      <td>佰客郡</td>\n",
       "      <td>双流</td>\n",
       "      <td>89.00</td>\n",
       "      <td>1.18</td>\n",
       "      <td>75.40</td>\n",
       "      <td>东 北</td>\n",
       "      <td>简装</td>\n",
       "      <td>中楼层(共16层)</td>\n",
       "      <td>2011.0</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>加贝书香尚品 精装修 带家具家电出售</td>\n",
       "      <td>加贝书香尚品</td>\n",
       "      <td>双流</td>\n",
       "      <td>64.50</td>\n",
       "      <td>1.25</td>\n",
       "      <td>51.69</td>\n",
       "      <td>南</td>\n",
       "      <td>精装</td>\n",
       "      <td>高楼层(共15层)</td>\n",
       "      <td>2007.0</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 48 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                       描述     位置信息  区域      总价    单价     面积     朝向    类型  \\\n",
       "0       威兰德装修套三对中庭，客户只给契税   威兰德小镇   双流   91.80  1.06  86.40     东    简装    \n",
       "1  房子清水套三户型方正采光好无遮挡，视野开阔！  南湖逸家二期   双流  128.50  1.94  66.12     东    毛坯    \n",
       "2     南湖逸家满二精装房，中间楼层，采光良好  南湖逸家二期   双流  153.00  2.08  73.59     南    精装    \n",
       "3       佰客郡精装修房子配套成熟业主真心卖     佰客郡   双流   89.00  1.18  75.40   东 北    简装    \n",
       "4      加贝书香尚品 精装修 带家具家电出售  加贝书香尚品   双流   64.50  1.25  51.69     南    精装    \n",
       "\n",
       "            楼层    建成时间  ...  6室3厅   6室4厅   7室1厅   7室2厅   7室3厅   7室4厅   7室5厅   \\\n",
       "0   中楼层(共26层)   2016.0  ...  False  False  False  False  False  False  False   \n",
       "1   高楼层(共33层)   2017.0  ...  False  False  False  False  False  False  False   \n",
       "2   中楼层(共34层)   2017.0  ...  False  False  False  False  False  False  False   \n",
       "3   中楼层(共16层)   2011.0  ...  False  False  False  False  False  False  False   \n",
       "4   高楼层(共15层)   2007.0  ...  False  False  False  False  False  False  False   \n",
       "\n",
       "   8室2厅   8室3厅   9室2厅   \n",
       "0  False  False  False  \n",
       "1  False  False  False  \n",
       "2  False  False  False  \n",
       "3  False  False  False  \n",
       "4  False  False  False  \n",
       "\n",
       "[5 rows x 48 columns]"
      ]
     },
     "execution_count": 1014,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#对户型使用独热编码并加入导员有数据帧中\n",
    "data = data.join(pd.get_dummies(data.户型))\n",
    "#删除原有列\n",
    "data = data.drop('户型',axis = 1)\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1015,
   "id": "c5486b20",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['双流' '大邑' '天府新区' '天府新区南区' '崇州' '彭州' '成华' '新津' '新都' '武侯' '温江' '简阳' '蒲江'\n",
      " '郫都' '都江堰' '金堂' '金牛' '锦江' '青白江' '青羊' '高新' '高新西' '龙泉驿']\n"
     ]
    }
   ],
   "source": [
    "#查看区域有多少种类\n",
    "print(data.区域.unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1016,
   "id": "dbdf5d0b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>描述</th>\n",
       "      <th>位置信息</th>\n",
       "      <th>区域</th>\n",
       "      <th>总价</th>\n",
       "      <th>单价</th>\n",
       "      <th>面积</th>\n",
       "      <th>朝向</th>\n",
       "      <th>类型</th>\n",
       "      <th>楼层</th>\n",
       "      <th>建成时间</th>\n",
       "      <th>...</th>\n",
       "      <th>郫都</th>\n",
       "      <th>都江堰</th>\n",
       "      <th>金堂</th>\n",
       "      <th>金牛</th>\n",
       "      <th>锦江</th>\n",
       "      <th>青白江</th>\n",
       "      <th>青羊</th>\n",
       "      <th>高新</th>\n",
       "      <th>高新西</th>\n",
       "      <th>龙泉驿</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>威兰德装修套三对中庭，客户只给契税</td>\n",
       "      <td>威兰德小镇</td>\n",
       "      <td>双流</td>\n",
       "      <td>91.80</td>\n",
       "      <td>1.06</td>\n",
       "      <td>86.40</td>\n",
       "      <td>东</td>\n",
       "      <td>简装</td>\n",
       "      <td>中楼层(共26层)</td>\n",
       "      <td>2016.0</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>房子清水套三户型方正采光好无遮挡，视野开阔！</td>\n",
       "      <td>南湖逸家二期</td>\n",
       "      <td>双流</td>\n",
       "      <td>128.50</td>\n",
       "      <td>1.94</td>\n",
       "      <td>66.12</td>\n",
       "      <td>东</td>\n",
       "      <td>毛坯</td>\n",
       "      <td>高楼层(共33层)</td>\n",
       "      <td>2017.0</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>南湖逸家满二精装房，中间楼层，采光良好</td>\n",
       "      <td>南湖逸家二期</td>\n",
       "      <td>双流</td>\n",
       "      <td>153.00</td>\n",
       "      <td>2.08</td>\n",
       "      <td>73.59</td>\n",
       "      <td>南</td>\n",
       "      <td>精装</td>\n",
       "      <td>中楼层(共34层)</td>\n",
       "      <td>2017.0</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>佰客郡精装修房子配套成熟业主真心卖</td>\n",
       "      <td>佰客郡</td>\n",
       "      <td>双流</td>\n",
       "      <td>89.00</td>\n",
       "      <td>1.18</td>\n",
       "      <td>75.40</td>\n",
       "      <td>东 北</td>\n",
       "      <td>简装</td>\n",
       "      <td>中楼层(共16层)</td>\n",
       "      <td>2011.0</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>加贝书香尚品 精装修 带家具家电出售</td>\n",
       "      <td>加贝书香尚品</td>\n",
       "      <td>双流</td>\n",
       "      <td>64.50</td>\n",
       "      <td>1.25</td>\n",
       "      <td>51.69</td>\n",
       "      <td>南</td>\n",
       "      <td>精装</td>\n",
       "      <td>高楼层(共15层)</td>\n",
       "      <td>2007.0</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 71 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                       描述     位置信息  区域      总价    单价     面积     朝向    类型  \\\n",
       "0       威兰德装修套三对中庭，客户只给契税   威兰德小镇   双流   91.80  1.06  86.40     东    简装    \n",
       "1  房子清水套三户型方正采光好无遮挡，视野开阔！  南湖逸家二期   双流  128.50  1.94  66.12     东    毛坯    \n",
       "2     南湖逸家满二精装房，中间楼层，采光良好  南湖逸家二期   双流  153.00  2.08  73.59     南    精装    \n",
       "3       佰客郡精装修房子配套成熟业主真心卖     佰客郡   双流   89.00  1.18  75.40   东 北    简装    \n",
       "4      加贝书香尚品 精装修 带家具家电出售  加贝书香尚品   双流   64.50  1.25  51.69     南    精装    \n",
       "\n",
       "            楼层    建成时间  ...     郫都    都江堰     金堂     金牛     锦江    青白江     青羊  \\\n",
       "0   中楼层(共26层)   2016.0  ...  False  False  False  False  False  False  False   \n",
       "1   高楼层(共33层)   2017.0  ...  False  False  False  False  False  False  False   \n",
       "2   中楼层(共34层)   2017.0  ...  False  False  False  False  False  False  False   \n",
       "3   中楼层(共16层)   2011.0  ...  False  False  False  False  False  False  False   \n",
       "4   高楼层(共15层)   2007.0  ...  False  False  False  False  False  False  False   \n",
       "\n",
       "      高新    高新西    龙泉驿  \n",
       "0  False  False  False  \n",
       "1  False  False  False  \n",
       "2  False  False  False  \n",
       "3  False  False  False  \n",
       "4  False  False  False  \n",
       "\n",
       "[5 rows x 71 columns]"
      ]
     },
     "execution_count": 1016,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#对使区域用独热编码并加入导员有数据帧中\n",
    "data = data.join(pd.get_dummies(data.区域))\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1017,
   "id": "1e40a9eb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[' 简装 ' ' 毛坯 ' ' 精装 ' ' 其他 ']\n",
      "[' 塔楼' ' 板塔结合' ' 板楼' ' 暂无数据' ' 平房']\n"
     ]
    }
   ],
   "source": [
    "#查看类型有多少种类\n",
    "print(data.类型.unique())\n",
    "#查看结构有多少种类\n",
    "print(data.结构.unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1018,
   "id": "3931e0ef",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5068\n",
      "683\n"
     ]
    }
   ],
   "source": [
    "#去掉字符串前后空格\n",
    "data['类型']=data.类型.str.strip()\n",
    "data['结构']=data.结构.str.strip()\n",
    "#查看类型为其他的记录个数\n",
    "print(len(data[data.类型 == '其他']))\n",
    "#查看结构为暂无数据的记录个数\n",
    "print(len(data[data.结构 == '暂无数据']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1019,
   "id": "60bf33ba",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>描述</th>\n",
       "      <th>位置信息</th>\n",
       "      <th>区域</th>\n",
       "      <th>总价</th>\n",
       "      <th>单价</th>\n",
       "      <th>面积</th>\n",
       "      <th>朝向</th>\n",
       "      <th>楼层</th>\n",
       "      <th>建成时间</th>\n",
       "      <th>关注人数</th>\n",
       "      <th>...</th>\n",
       "      <th>高新</th>\n",
       "      <th>高新西</th>\n",
       "      <th>龙泉驿</th>\n",
       "      <th>毛坯</th>\n",
       "      <th>简装</th>\n",
       "      <th>精装</th>\n",
       "      <th>塔楼</th>\n",
       "      <th>平房</th>\n",
       "      <th>板塔结合</th>\n",
       "      <th>板楼</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>威兰德装修套三对中庭，客户只给契税</td>\n",
       "      <td>威兰德小镇</td>\n",
       "      <td>双流</td>\n",
       "      <td>91.80</td>\n",
       "      <td>1.06</td>\n",
       "      <td>86.40</td>\n",
       "      <td>东</td>\n",
       "      <td>中楼层(共26层)</td>\n",
       "      <td>2016.0</td>\n",
       "      <td>135.0</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>房子清水套三户型方正采光好无遮挡，视野开阔！</td>\n",
       "      <td>南湖逸家二期</td>\n",
       "      <td>双流</td>\n",
       "      <td>128.50</td>\n",
       "      <td>1.94</td>\n",
       "      <td>66.12</td>\n",
       "      <td>东</td>\n",
       "      <td>高楼层(共33层)</td>\n",
       "      <td>2017.0</td>\n",
       "      <td>40.0</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>南湖逸家满二精装房，中间楼层，采光良好</td>\n",
       "      <td>南湖逸家二期</td>\n",
       "      <td>双流</td>\n",
       "      <td>153.00</td>\n",
       "      <td>2.08</td>\n",
       "      <td>73.59</td>\n",
       "      <td>南</td>\n",
       "      <td>中楼层(共34层)</td>\n",
       "      <td>2017.0</td>\n",
       "      <td>58.0</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>佰客郡精装修房子配套成熟业主真心卖</td>\n",
       "      <td>佰客郡</td>\n",
       "      <td>双流</td>\n",
       "      <td>89.00</td>\n",
       "      <td>1.18</td>\n",
       "      <td>75.40</td>\n",
       "      <td>东 北</td>\n",
       "      <td>中楼层(共16层)</td>\n",
       "      <td>2011.0</td>\n",
       "      <td>36.0</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>加贝书香尚品 精装修 带家具家电出售</td>\n",
       "      <td>加贝书香尚品</td>\n",
       "      <td>双流</td>\n",
       "      <td>64.50</td>\n",
       "      <td>1.25</td>\n",
       "      <td>51.69</td>\n",
       "      <td>南</td>\n",
       "      <td>高楼层(共15层)</td>\n",
       "      <td>2007.0</td>\n",
       "      <td>38.0</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 76 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                       描述     位置信息  区域      总价    单价     面积     朝向  \\\n",
       "0       威兰德装修套三对中庭，客户只给契税   威兰德小镇   双流   91.80  1.06  86.40     东    \n",
       "1  房子清水套三户型方正采光好无遮挡，视野开阔！  南湖逸家二期   双流  128.50  1.94  66.12     东    \n",
       "2     南湖逸家满二精装房，中间楼层，采光良好  南湖逸家二期   双流  153.00  2.08  73.59     南    \n",
       "3       佰客郡精装修房子配套成熟业主真心卖     佰客郡   双流   89.00  1.18  75.40   东 北    \n",
       "4      加贝书香尚品 精装修 带家具家电出售  加贝书香尚品   双流   64.50  1.25  51.69     南    \n",
       "\n",
       "            楼层    建成时间   关注人数  ...     高新    高新西    龙泉驿     毛坯     简装     精装  \\\n",
       "0   中楼层(共26层)   2016.0  135.0  ...  False  False  False  False   True  False   \n",
       "1   高楼层(共33层)   2017.0   40.0  ...  False  False  False   True  False  False   \n",
       "2   中楼层(共34层)   2017.0   58.0  ...  False  False  False  False  False   True   \n",
       "3   中楼层(共16层)   2011.0   36.0  ...  False  False  False  False   True  False   \n",
       "4   高楼层(共15层)   2007.0   38.0  ...  False  False  False  False  False   True   \n",
       "\n",
       "      塔楼     平房   板塔结合     板楼  \n",
       "0   True  False  False  False  \n",
       "1  False  False   True  False  \n",
       "2  False  False   True  False  \n",
       "3  False  False  False   True  \n",
       "4  False  False  False   True  \n",
       "\n",
       "[5 rows x 76 columns]"
      ]
     },
     "execution_count": 1019,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#丢弃无效数据\n",
    "data = data[(data.类型 !='其他')&(data.结构 != '暂无数据')]\n",
    "data = data.join(pd.get_dummies(data.类型))\n",
    "data = data.join(pd.get_dummies(data.结构))\n",
    "#删除原有列\n",
    "data = data.drop('类型',axis =1)\n",
    "data = data.drop('结构',axis =1)\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1020,
   "id": "2be3cfd1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([' 东 ', ' 南 ', ' 东 北 ', ' 西北 ', ' 西 ', ' 南 北 ', ' 东 西 ', ' 东北 ',\n",
       "       ' 东南 ', ' 北 ', ' 西南 ', ' 西北 北 ', ' 西南 西 ', ' 东 南 ', ' 南 西南 ',\n",
       "       ' 东南 南 ', ' 西 西北 ', ' 东 东南 ', ' 东南 西 ', ' 南 西 ', ' 东南 西南 ',\n",
       "       ' 西南 东北 ', ' 东 西南 ', ' 北 南 ', ' 东 东南 南 ', ' 西 西南 ', ' 西 北 ',\n",
       "       ' 西南 北 ', ' 北 东北 ', ' 东南 北 ', ' 东南 东北 ', ' 西南 南 东南 ', ' 东 北 西南 ',\n",
       "       ' 东 西北 ', ' 东 东南 西 ', ' 南 东南 ', ' 西南 南 ', ' 东 东南 东北 ', ' 东南 西北 ',\n",
       "       ' 东 北 西 ', ' 东南 南 东 ', ' 西南 西北 ', ' 西 南 ', ' 西 东 ', ' 东 南 西 ',\n",
       "       ' 西南 东 ', ' 东南 南 西南 ', ' 东 东北 ', ' 东南 东 ', ' 北 西北 ', ' 南 西北 ',\n",
       "       ' 南 东北 ', ' 东北 南 ', ' 东北 东南 ', ' 东南 南 北 ', ' 南 西南 东北 ', ' 西南 西 南 ',\n",
       "       ' 南 东 ', ' 西北 西 ', ' 南 西北 北 ', ' 东 南 西 北 ', ' 西 东北 ',\n",
       "       ' 东 东南 西 西北 ', ' 东北 西南 ', ' 东 南 北 ', ' 东北 东南 西南 ', ' 南 北 东北 ',\n",
       "       ' 西北 东南 ', ' 西北 东北 ', ' 西北 西南 ', ' 西南 东南 ', ' 南 西南 西 ', ' 北 东南 ',\n",
       "       ' 东南 西 西北 ', ' 西 西南 西北 ', ' 西南 南 东南 东 ', ' 东 东南 西南 西 ', ' 东 西 北 ',\n",
       "       ' 东北 东 ', ' 南 东南 西南 ', ' 北 东 ', ' 东 西 南 北 ', ' 东南 西南 西北 ',\n",
       "       ' 东北 西北 ', ' 东 南 西南 ', ' 东北 西 ', ' 东南 南 西南 东 ', ' 北 西 ',\n",
       "       ' 东 东南 南 西南 ', ' 东南 西南 西 ', ' 东 东南 北 东北 ', ' 东北 北 '], dtype=object)"
      ]
     },
     "execution_count": 1020,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#查看朝向列种类\n",
    "data.朝向.unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1021,
   "id": "c4205d34",
   "metadata": {},
   "outputs": [],
   "source": [
    "#自定义独热编码的函数\n",
    "def my_get_dummies(ser):\n",
    "    base_dirt = ['东', '南', '西', '北', '东北', '东南', '西南', '西北']\n",
    "    # np.zeros 是 NumPy 库中的一个函数，用于创建全零数组\n",
    "    base_data = np.zeros((len(ser),),dtype=np.int_)\n",
    "    df = pd.DataFrame({\n",
    "        '东':base_data,\n",
    "        '南':base_data,\n",
    "        '西':base_data, \n",
    "        '北':base_data,\n",
    "        '东北':base_data, \n",
    "        '东南':base_data, \n",
    "        '西南':base_data, \n",
    "        '西北':base_data\n",
    "    },index=ser.index)\n",
    "\n",
    "    for irec in ser.index:\n",
    "        # 分割字符串\n",
    "        rec = ser[irec].strip().split(' ')\n",
    "        # 遍历每条记录分隔后的方位\n",
    "        for dirt in rec:\n",
    "            # 检查是否存在8个基本方位以外的记录\n",
    "            if dirt not in base_dirt:\n",
    "                print('发现异常方位：', dirt)\n",
    "            else:\n",
    "                # 将对应的列值设为1\n",
    "                # df[dirt][irec] = 1\n",
    "                df.loc[irec, dirt] = 1\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1022,
   "id": "3371342c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>描述</th>\n",
       "      <th>位置信息</th>\n",
       "      <th>区域</th>\n",
       "      <th>总价</th>\n",
       "      <th>单价</th>\n",
       "      <th>面积</th>\n",
       "      <th>楼层</th>\n",
       "      <th>建成时间</th>\n",
       "      <th>关注人数</th>\n",
       "      <th>发布时间</th>\n",
       "      <th>...</th>\n",
       "      <th>板塔结合</th>\n",
       "      <th>板楼</th>\n",
       "      <th>东</th>\n",
       "      <th>南</th>\n",
       "      <th>西</th>\n",
       "      <th>北</th>\n",
       "      <th>东北</th>\n",
       "      <th>东南</th>\n",
       "      <th>西南</th>\n",
       "      <th>西北</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>威兰德装修套三对中庭，客户只给契税</td>\n",
       "      <td>威兰德小镇</td>\n",
       "      <td>双流</td>\n",
       "      <td>91.80</td>\n",
       "      <td>1.06</td>\n",
       "      <td>86.40</td>\n",
       "      <td>中楼层(共26层)</td>\n",
       "      <td>2016.0</td>\n",
       "      <td>135.0</td>\n",
       "      <td>6个月以前发布</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>房子清水套三户型方正采光好无遮挡，视野开阔！</td>\n",
       "      <td>南湖逸家二期</td>\n",
       "      <td>双流</td>\n",
       "      <td>128.50</td>\n",
       "      <td>1.94</td>\n",
       "      <td>66.12</td>\n",
       "      <td>高楼层(共33层)</td>\n",
       "      <td>2017.0</td>\n",
       "      <td>40.0</td>\n",
       "      <td>2个月以前发布</td>\n",
       "      <td>...</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>南湖逸家满二精装房，中间楼层，采光良好</td>\n",
       "      <td>南湖逸家二期</td>\n",
       "      <td>双流</td>\n",
       "      <td>153.00</td>\n",
       "      <td>2.08</td>\n",
       "      <td>73.59</td>\n",
       "      <td>中楼层(共34层)</td>\n",
       "      <td>2017.0</td>\n",
       "      <td>58.0</td>\n",
       "      <td>15天以前发布</td>\n",
       "      <td>...</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>佰客郡精装修房子配套成熟业主真心卖</td>\n",
       "      <td>佰客郡</td>\n",
       "      <td>双流</td>\n",
       "      <td>89.00</td>\n",
       "      <td>1.18</td>\n",
       "      <td>75.40</td>\n",
       "      <td>中楼层(共16层)</td>\n",
       "      <td>2011.0</td>\n",
       "      <td>36.0</td>\n",
       "      <td>2个月以前发布</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>加贝书香尚品 精装修 带家具家电出售</td>\n",
       "      <td>加贝书香尚品</td>\n",
       "      <td>双流</td>\n",
       "      <td>64.50</td>\n",
       "      <td>1.25</td>\n",
       "      <td>51.69</td>\n",
       "      <td>高楼层(共15层)</td>\n",
       "      <td>2007.0</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1个月以前发布</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 83 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                       描述     位置信息  区域      总价    单价     面积           楼层  \\\n",
       "0       威兰德装修套三对中庭，客户只给契税   威兰德小镇   双流   91.80  1.06  86.40   中楼层(共26层)    \n",
       "1  房子清水套三户型方正采光好无遮挡，视野开阔！  南湖逸家二期   双流  128.50  1.94  66.12   高楼层(共33层)    \n",
       "2     南湖逸家满二精装房，中间楼层，采光良好  南湖逸家二期   双流  153.00  2.08  73.59   中楼层(共34层)    \n",
       "3       佰客郡精装修房子配套成熟业主真心卖     佰客郡   双流   89.00  1.18  75.40   中楼层(共16层)    \n",
       "4      加贝书香尚品 精装修 带家具家电出售  加贝书香尚品   双流   64.50  1.25  51.69   高楼层(共15层)    \n",
       "\n",
       "     建成时间   关注人数      发布时间  ...   板塔结合     板楼  东  南  西  北  东北  东南  西南  西北  \n",
       "0  2016.0  135.0   6个月以前发布  ...  False  False  1  0  0  0   0   0   0   0  \n",
       "1  2017.0   40.0   2个月以前发布  ...   True  False  1  0  0  0   0   0   0   0  \n",
       "2  2017.0   58.0   15天以前发布  ...   True  False  0  1  0  0   0   0   0   0  \n",
       "3  2011.0   36.0   2个月以前发布  ...  False   True  1  0  0  1   0   0   0   0  \n",
       "4  2007.0   38.0   1个月以前发布  ...  False   True  0  1  0  0   0   0   0   0  \n",
       "\n",
       "[5 rows x 83 columns]"
      ]
     },
     "execution_count": 1022,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 原生独热编码\n",
    "# data = data.join(pd.get_dummies(data.朝向))\n",
    "# data.head()\n",
    "#自定义独热编码\n",
    "data = data.join(my_get_dummies(data.朝向))\n",
    "# 删除原有列\n",
    "data = data.drop('朝向', axis=1)\n",
    "# 查看数据\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1023,
   "id": "bbb0750f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "453\n"
     ]
    }
   ],
   "source": [
    "# 检测数据格式一致性\n",
    "(~data.楼层.str.contains('楼层')).sum()\n",
    "print((~data.楼层.str.contains('楼层')).sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1024,
   "id": "de572663",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "32783\n"
     ]
    }
   ],
   "source": [
    "#舍弃数据\n",
    "data = data[data.楼层.str.contains('楼层')]\n",
    "print(len(data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1025,
   "id": "bc65b867",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>描述</th>\n",
       "      <th>位置信息</th>\n",
       "      <th>区域</th>\n",
       "      <th>总价</th>\n",
       "      <th>单价</th>\n",
       "      <th>面积</th>\n",
       "      <th>建成时间</th>\n",
       "      <th>关注人数</th>\n",
       "      <th>发布时间</th>\n",
       "      <th>0室0厅</th>\n",
       "      <th>...</th>\n",
       "      <th>西</th>\n",
       "      <th>北</th>\n",
       "      <th>东北</th>\n",
       "      <th>东南</th>\n",
       "      <th>西南</th>\n",
       "      <th>西北</th>\n",
       "      <th>中楼层</th>\n",
       "      <th>低楼层</th>\n",
       "      <th>高楼层</th>\n",
       "      <th>总楼层</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>威兰德装修套三对中庭，客户只给契税</td>\n",
       "      <td>威兰德小镇</td>\n",
       "      <td>双流</td>\n",
       "      <td>91.80</td>\n",
       "      <td>1.06</td>\n",
       "      <td>86.40</td>\n",
       "      <td>2016.0</td>\n",
       "      <td>135.0</td>\n",
       "      <td>6个月以前发布</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>房子清水套三户型方正采光好无遮挡，视野开阔！</td>\n",
       "      <td>南湖逸家二期</td>\n",
       "      <td>双流</td>\n",
       "      <td>128.50</td>\n",
       "      <td>1.94</td>\n",
       "      <td>66.12</td>\n",
       "      <td>2017.0</td>\n",
       "      <td>40.0</td>\n",
       "      <td>2个月以前发布</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>南湖逸家满二精装房，中间楼层，采光良好</td>\n",
       "      <td>南湖逸家二期</td>\n",
       "      <td>双流</td>\n",
       "      <td>153.00</td>\n",
       "      <td>2.08</td>\n",
       "      <td>73.59</td>\n",
       "      <td>2017.0</td>\n",
       "      <td>58.0</td>\n",
       "      <td>15天以前发布</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>34</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>佰客郡精装修房子配套成熟业主真心卖</td>\n",
       "      <td>佰客郡</td>\n",
       "      <td>双流</td>\n",
       "      <td>89.00</td>\n",
       "      <td>1.18</td>\n",
       "      <td>75.40</td>\n",
       "      <td>2011.0</td>\n",
       "      <td>36.0</td>\n",
       "      <td>2个月以前发布</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>加贝书香尚品 精装修 带家具家电出售</td>\n",
       "      <td>加贝书香尚品</td>\n",
       "      <td>双流</td>\n",
       "      <td>64.50</td>\n",
       "      <td>1.25</td>\n",
       "      <td>51.69</td>\n",
       "      <td>2007.0</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1个月以前发布</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 86 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                       描述     位置信息  区域      总价    单价     面积    建成时间   关注人数  \\\n",
       "0       威兰德装修套三对中庭，客户只给契税   威兰德小镇   双流   91.80  1.06  86.40  2016.0  135.0   \n",
       "1  房子清水套三户型方正采光好无遮挡，视野开阔！  南湖逸家二期   双流  128.50  1.94  66.12  2017.0   40.0   \n",
       "2     南湖逸家满二精装房，中间楼层，采光良好  南湖逸家二期   双流  153.00  2.08  73.59  2017.0   58.0   \n",
       "3       佰客郡精装修房子配套成熟业主真心卖     佰客郡   双流   89.00  1.18  75.40  2011.0   36.0   \n",
       "4      加贝书香尚品 精装修 带家具家电出售  加贝书香尚品   双流   64.50  1.25  51.69  2007.0   38.0   \n",
       "\n",
       "       发布时间  0室0厅   ...  西  北  东北  东南  西南  西北    中楼层    低楼层    高楼层  总楼层  \n",
       "0   6个月以前发布  False  ...  0  0   0   0   0   0   True  False  False   26  \n",
       "1   2个月以前发布  False  ...  0  0   0   0   0   0  False  False   True   33  \n",
       "2   15天以前发布  False  ...  0  0   0   0   0   0   True  False  False   34  \n",
       "3   2个月以前发布  False  ...  0  1   0   0   0   0   True  False  False   16  \n",
       "4   1个月以前发布  False  ...  0  0   0   0   0   0  False  False   True   15  \n",
       "\n",
       "[5 rows x 86 columns]"
      ]
     },
     "execution_count": 1025,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 提取所在楼层\n",
    "data['所在楼层'] = data.楼层.map(lambda x : x.split('(')[0])\n",
    "# 对所在楼层进行独热编码\n",
    "data = data.join(pd.get_dummies(data.所在楼层))\n",
    "# 使用正则表达式提取数据并转换为int类型\n",
    "data['总楼层'] = data.楼层.map(lambda x : int(re.findall(r'\\(共(.*?)层\\)', x)[0]))\n",
    "# 删除原有列\n",
    "data = data.drop('楼层', axis=1)\n",
    "# 删除所在楼层列\n",
    "data = data.drop('所在楼层', axis=1)\n",
    "# 查看数据\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1026,
   "id": "779838f5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['描述', '位置信息', '区域', '总价', '单价', '面积', '建成时间', '关注人数', '发布时间', '0室0厅 ', '0室1厅 ', '1室0厅 ', '1室1厅 ', '1室2厅 ', '2室0厅 ', '2室1厅 ', '2室2厅 ', '3室0厅 ', '3室1厅 ', '3室2厅 ', '3室3厅 ', '3室4厅 ', '4室0厅 ', '4室1厅 ', '4室2厅 ', '4室3厅 ', '4室4厅 ', '5室0厅 ', '5室1厅 ', '5室2厅 ', '5室3厅 ', '5室4厅 ', '6室1厅 ', '6室2厅 ', '6室3厅 ', '6室4厅 ', '7室1厅 ', '7室2厅 ', '7室3厅 ', '7室4厅 ', '7室5厅 ', '8室2厅 ', '8室3厅 ', '9室2厅 ', '双流', '大邑', '天府新区', '天府新区南区', '崇州', '彭州', '成华', '新津', '新都', '武侯', '温江', '简阳', '蒲江', '郫都', '都江堰', '金堂', '金牛', '锦江', '青白江', '青羊', '高新', '高新西', '龙泉驿', '毛坯', '简装', '精装', '塔楼', '平房', '板塔结合', '板楼', '东', '南', '西', '北', '东北', '东南', '西南', '西北', ' 中楼层', ' 低楼层', ' 高楼层', '总楼层']\n"
     ]
    }
   ],
   "source": [
    "# 获取所有列标题\n",
    "column_headers = data.columns.tolist()\n",
    "print(column_headers)  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1027,
   "id": "a7c87a2e",
   "metadata": {},
   "outputs": [],
   "source": [
    "#删除发布时间信息\n",
    "data = data.drop('发布时间',axis =1)\n",
    "#去掉列标题空白\n",
    "data = data.rename(columns = lambda x:x.strip())\n",
    "#保存数据\n",
    "output_file_path = '房产信息_预处理.xlsx'\n",
    "data.to_excel(output_file_path,index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
